Import Libary
In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default='notebook'
import seaborn as sns
Load Data
In [ ]:
# load the source data from Ontario.ca
url = "https://www.ontario.ca/public-sector-salary-disclosure/pssd-assets/files/2022/tbs-pssd-compendium-salary-disclosed-2022-en-utf-8-2023-03-24.csv"
df_raw = pd.read_csv(url)
In [ ]:
df_raw.head()
Out[ ]:
| Sector | Last Name | First Name | Salary | Benefits | Employer | Job Title | Year | |
|---|---|---|---|---|---|---|---|---|
| 0 | Colleges | Whitaker | Christopher | 552946.10 | 64388.76 | Humber College Institute Of Technology and Adv... | President | 2022 |
| 1 | Colleges | Morrison | Janet | 459810.99 | 11340.95 | Sheridan College Institute Of Technology and A... | President and Vice Chancellor | 2022 |
| 2 | Colleges | Sado | Anne | 454202.32 | 412.56 | George Brown College Of Applied Arts and Techn... | President, Emeritus | 2022 |
| 3 | Colleges | Agnew | David | 422065.42 | 18299.46 | Seneca College Of Applied Arts and Technology | President | 2022 |
| 4 | Colleges | Tibbits | John | 409900.00 | 923.07 | Conestoga College Institute Of Technology and ... | President | 2022 |
In [ ]:
# remove "seconded" companies which is temporary from the sunshine list
df_raw = df_raw[df_raw.apply(lambda x: 'seconded' not in x['Sector'].lower(),1)]
# count how many people each company entered the sunshine list in 2022
df_company_entrycount = df_raw.groupby('Employer')['Salary'].count().sort_values(ascending=False).reset_index()
# draw a box plot to help filter out the minorities
fig = plt.figure(figsize =(5, 3))
plt.boxplot(df_company_entrycount['Salary'].values)
plt.xlabel("Sunshine List Entry")
plt.ylabel("# of Entries")
plt.show()
It looks like most of the sunshine list population is occupied by the few top companies, let's check.
In [ ]:
ratio = 0.9 # set a ratio to test the portion
# Sunshine list total people
Total1 = df_raw.shape[0]
print('There are {} people in total entered sunshine list in 2022, and {}% of that is {}.'.format(Total1,
ratio*100,
int(Total1*ratio)))
# Sunshine list total company
Total2 = df_company_entrycount.shape[0]
print('There are {} companies in total entered sunshine list in 2022.'.format(Total2))
####
df_company_entrycount['cumsum'] = df_company_entrycount['Salary'].cumsum()
df_company_topperc = df_company_entrycount[df_company_entrycount['cumsum']<=Total1*ratio]
print("The top {} companies from all the {} companies, which is only {}%, account for {}% of the entire population in the sunshine list.".format(df_company_topperc.index[-1],
Total2,
round(df_company_topperc.index[-1]/Total2*100,2),
ratio*100))
# obtain a list of the top majority companies
majority_companies = df_company_topperc['Employer'].values
There are 266770 people in total entered sunshine list in 2022, and 90.0% of that is 240093. There are 2232 companies in total entered sunshine list in 2022. The top 236 companies from all the 2232 companies, which is only 10.57%, account for 90.0% of the entire population in the sunshine list.
So, 10.5% of the companies account for 90% of all the entries in the sunshine list, which is even more extreme than the "20-80" law.
Consolidate Sectors¶
- Some sectors need to be combined, such as Government of Ontario subs
- All Ontario Power Generation sectors will change to "Crown Agencies"
In [ ]:
def sector_change(x):
if 'Government of Ontario' in x:
return 'Government of Ontario'
elif 'Ontario Power Generation' in x:
return 'Crown Agencies'
else:
return x
df_raw = df_raw.rename(columns = {'Sector':'Sector_Original'})
df_raw['Sector'] = df_raw['Sector_Original'].apply(lambda x: sector_change(x))
Find Top N Companies in Each Sector¶
- Omit 'Seconded' sector which is temporary
- Only picked the top companies that account for the 90% of the sunshine list population
- Ranked based on the average salary
In [ ]:
df_raw
Out[ ]:
| Sector_Original | Last Name | First Name | Salary | Benefits | Employer | Job Title | Year | Sector | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Colleges | Whitaker | Christopher | 552946.10 | 64388.76 | Humber College Institute Of Technology and Adv... | President | 2022 | Colleges |
| 1 | Colleges | Morrison | Janet | 459810.99 | 11340.95 | Sheridan College Institute Of Technology and A... | President and Vice Chancellor | 2022 | Colleges |
| 2 | Colleges | Sado | Anne | 454202.32 | 412.56 | George Brown College Of Applied Arts and Techn... | President, Emeritus | 2022 | Colleges |
| 3 | Colleges | Agnew | David | 422065.42 | 18299.46 | Seneca College Of Applied Arts and Technology | President | 2022 | Colleges |
| 4 | Colleges | Tibbits | John | 409900.00 | 923.07 | Conestoga College Institute Of Technology and ... | President | 2022 | Colleges |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 266898 | Universities | De Fazio | Arin | 100008.68 | 208.08 | University Of Toronto | Business Officer | 2022 | Universities |
| 266899 | Universities | Cadieux | Michelle | 100007.92 | 246.67 | McMaster University | Instructional Assistant / Sessional Faculty | 2022 | Universities |
| 266900 | Universities | Critoph | Elise | 100006.96 | 0.00 | University Of Ottawa | Professeur(e) adjoint(e) / Assistant Professor | 2022 | Universities |
| 266901 | Universities | Marsh | Jeffrey | 100000.08 | 0.00 | Laurentian University Of Sudbury | Associé(e) de recherche / Research Associate | 2022 | Universities |
| 266902 | Universities | Vera | Dusya | 100000.00 | 1029.80 | University Of Western Ontario | Professor | 2022 | Universities |
266770 rows × 9 columns
In [ ]:
# filter the top 10% companies that account for 90% of the sunshine list population
df_majority = df_raw[df_raw['Employer'].isin(majority_companies)]
# rank partitions
df_top_company = df_majority.groupby(['Sector','Employer'])['Salary'].mean().reset_index().sort_values(['Sector','Salary'], ascending= False).reset_index(drop = True)
df_top_company['Rank'] = df_top_company.groupby('Sector').cumcount()+1
# take out top N in each sector
N=7
df_top_company = df_top_company[df_top_company['Rank']<=N]
In [ ]:
df_top_company.head(50)
Out[ ]:
| Sector | Employer | Salary | Rank | |
|---|---|---|---|---|
| 0 | Universities | McMaster University | 166822.142135 | 1 |
| 1 | Universities | Queen’s University | 164340.277778 | 2 |
| 2 | Universities | University Of Toronto | 164150.478942 | 3 |
| 3 | Universities | University Of Western Ontario | 162056.389025 | 4 |
| 4 | Universities | York University | 162009.121341 | 5 |
| 5 | Universities | Brock University | 161613.305000 | 6 |
| 6 | Universities | University Of Waterloo | 160125.714246 | 7 |
| 19 | School Boards | Conseil Scolaire Catholique Franco-Nord | 111707.577616 | 1 |
| 20 | School Boards | Huron-Superior Catholic District School Board | 111251.266057 | 2 |
| 21 | School Boards | Hastings And Prince Edward District School Board | 110984.748144 | 3 |
| 22 | School Boards | Conseil Scolaire Public Du Grand Nord De L’ont... | 110480.456094 | 4 |
| 23 | School Boards | Conseil Scolaire Catholique Des Grandes Rivières | 109977.921855 | 5 |
| 24 | School Boards | Keewatin-Patricia District School Board | 109943.819870 | 6 |
| 25 | School Boards | Nipissing-Parry Sound Catholic District School... | 109932.869152 | 7 |
| 82 | Other Public Sector Employers | ORNGE | 133779.237306 | 1 |
| 83 | Other Public Sector Employers | Canadian Institute For Health Info | 126061.004938 | 2 |
| 84 | Other Public Sector Employers | Municipal Property Assessment Corporation | 125422.239395 | 3 |
| 85 | Other Public Sector Employers | Children’s Aid Society Of Toronto | 116979.580521 | 4 |
| 86 | Municipalities & Services | City Of Ottawa - Police Services | 140473.341470 | 1 |
| 87 | Municipalities & Services | Town Of Whitby | 139354.577070 | 2 |
| 88 | Municipalities & Services | City Of St Catharines | 135941.197087 | 3 |
| 89 | Municipalities & Services | City Of Toronto - Police Service | 135818.996993 | 4 |
| 90 | Municipalities & Services | City Of Greater Sudbury | 135155.077504 | 5 |
| 91 | Municipalities & Services | City Of Oshawa | 134892.790878 | 6 |
| 92 | Municipalities & Services | Regional Municipality Of Peel Police Services | 134416.044886 | 7 |
| 135 | Hospitals & Boards of Public Health | The Hospital For Sick Children | 141840.190289 | 1 |
| 136 | Hospitals & Boards of Public Health | Waypoint Centre For Mental Health Care | 140404.897309 | 2 |
| 137 | Hospitals & Boards of Public Health | St. Joseph’s Care Group | 137216.052395 | 3 |
| 138 | Hospitals & Boards of Public Health | St. Joseph’s Health Care London | 130589.325661 | 4 |
| 139 | Hospitals & Boards of Public Health | Sinai Health System | 129318.319524 | 5 |
| 140 | Hospitals & Boards of Public Health | Ontario Shores Centre For Mental Health Sciences | 127028.438267 | 6 |
| 141 | Hospitals & Boards of Public Health | Centre For Addiction And Mental Health | 126588.935057 | 7 |
| 188 | Government of Ontario | Ontario Court Of Justice | 248796.409095 | 1 |
| 189 | Government of Ontario | Attorney General | 178025.093279 | 2 |
| 190 | Government of Ontario | Ontario Provincial Police | 135262.569400 | 3 |
| 191 | Government of Ontario | Legislative Assembly | 134999.801554 | 4 |
| 192 | Government of Ontario | Economic Development, Job Creation and Trade | 126082.388205 | 5 |
| 193 | Government of Ontario | Solicitor General | 123554.543257 | 6 |
| 194 | Government of Ontario | Health | 122424.568368 | 7 |
| 206 | Crown Agencies | Ontario Securities Commission | 172268.180071 | 1 |
| 207 | Crown Agencies | Ontario Power Generation | 154761.504684 | 2 |
| 208 | Crown Agencies | Ontario Infrastructure And Lands Corporation (... | 148839.112619 | 3 |
| 209 | Crown Agencies | Independent Electricity System Operator | 147583.833009 | 4 |
| 210 | Crown Agencies | Financial Services Regulatory Authority of Ont... | 145279.757098 | 5 |
| 211 | Crown Agencies | Ontario Lottery And Gaming Corporation | 141196.325491 | 6 |
| 212 | Crown Agencies | Alcohol And Gaming Commission Of Ontario | 136683.544000 | 7 |
| 220 | Colleges | Georgian College Of Applied Arts and Technology | 121859.262324 | 1 |
| 221 | Colleges | Humber College Institute Of Technology and Adv... | 121829.540590 | 2 |
| 222 | Colleges | George Brown College Of Applied Arts and Techn... | 121301.492447 | 3 |
| 223 | Colleges | Seneca College Of Applied Arts and Technology | 121250.636950 | 4 |
Perform word wrap for the employer name. This will help the yaxis ticker in the final plot look uniformed.
In [ ]:
def word_wrap(x, line_chars = 30):
words = x.split(' ')
lines = []
line=''
for w in words:
if len(line)+len(w)+1>line_chars:
line+=' '*(line_chars-len(line))
lines.append(line)
line=''
else:
line+=' '+w
if line:
line+=' '*(line_chars-len(line))
lines.append(line)
wrap_x = '<br>'.join(lines)
return wrap_x
df_top_company['Employer_Wrap'] = df_top_company['Employer'].apply(word_wrap)
Visualize Top N Companys' Salary Distribution in Each Sector¶
In [ ]:
colors = sns.color_palette(palette=None, n_colors=N).as_hex()
# sector = ['Universities']
for sector in df_top_company['Sector'].unique():
df_companies = df_top_company[df_top_company['Sector']==sector]
companies = df_companies['Employer'].values
df_sample = df_raw[df_raw['Employer'].isin(companies)]
df_sample = df_sample.merge(df_companies[['Employer','Employer_Wrap','Rank']], on = 'Employer', how = 'left')
df_sample = df_sample[['Employer_Wrap','Job Title','Salary','Rank']].sort_values(['Rank','Salary'])
X = df_sample['Employer_Wrap'].unique()
X = X[::-1] # reverse the list to show the highest one at the top
Y = [df_sample[df_sample['Employer_Wrap']==i]['Salary'].values for i in X]
Title = [df_sample[df_sample['Employer_Wrap']==i]['Job Title'].values for i in X]
fig = go.Figure()
for xd, yd, td, cls in zip(X, Y, Title, colors):
fig.add_trace(go.Box(
x=yd,
name=xd,
boxpoints='all', # whether to show all points or other choices
jitter=0.6, # sample points width relative to the box width
boxmean = True,
whiskerwidth=0.3,
fillcolor=cls,
marker_size=1.2,
line_width=1.5,
pointpos =0, # sample points location relative to the box [-2,2]
width = 0.6, # box width
notched=True, # wether to display a notch for the median
customdata=td, # used for hover
hovertemplate='<br>Salary: %{x}<br>Job: %{customdata}'
)
)
fig.update_layout(
title='Employee Salaries over $100k of the Top {} Company in the "<b>{}</b>" Sector'.format(N,sector),
autosize=False,
width=1000,
height=int(500*(1+(N-5)/5)), # the height is normalized to N=5, and will dynamically adjust based on N size
xaxis=dict(
range=[95000, 250000], # set and uniform the initial xaxis zoom (can adjust in plot)
rangeslider=dict(
visible=True,
thickness = 0.05
)
),
# yaxis=dict(
# autorange=True,
# showgrid=True,
# zeroline=True,
# dtick=5,
# gridcolor='rgb(255, 255, 255)',
# gridwidth=1,
# zerolinecolor='rgb(255, 255, 255)',
# zerolinewidth=2,
# ),
# margin=dict(
# l=40,
# r=30,
# b=80,
# t=100,
# ),
paper_bgcolor='rgb(243, 243, 243)',
plot_bgcolor='rgb(243, 243, 243)',
showlegend=False
)
fig.show()
In [ ]: